Reduce
对指定维度进行归约。
- 输入:
src_data - 输入数据的地址
param - 算子计算所需参数的结构体。其各成员见下述。
core_mask - 核掩码。
ReduceParameter定义:
/**
 * Parameter block consumed by the reduce operators.
 *
 * The operator picks the actual reduction method from mode_.
 * The supported values are:
 *   Reduce_Mean      = 0
 *   Reduce_Max       = 1
 *   Reduce_Min       = 2
 *   Reduce_Prod      = 3
 *   Reduce_Sum       = 4
 *   Reduce_SumSquare = 5
 *   Reduce_ASum      = 6
 *   Reduce_L2Norm    = 7
 */
typedef struct ReduceParameter {
    void** data_buffers_;  // buffers used to store intermediate results
    int* outer_sizes_;     // per reduced axis: element count of all axes before that axis
    int* inner_sizes_;     // per reduced axis: element count of all axes after that axis
    int* axis_sizes_;      // per reduced axis: element count along that axis
    int total_num_;        // total number of elements in the input tensor
    int num_axes_;         // number of axes to reduce
    int mode_;             // reduction mode, one of the values listed above
    int output_num_;       // total number of elements in the output tensor
} ReduceParameter;
- 输出:
dst_data - 输出地址。
- 支持平台:
FT78NE、MT7004
备注
FT78NE 支持int8, int16, int32, fp32, fp64
MT7004 支持fp16, fp32, int16, int32
共享存储版本:
-
void i8_reduce_s(int8_t *src_data, int8_t *dst_data, ReduceParameter *param, int core_mask)
-
void i16_reduce_s(int16_t *src_data, half *dst_data, ReduceParameter *param, int core_mask)
-
void i32_reduce_s(int *src_data, float *dst_data, ReduceParameter *param, int core_mask)
-
void hp_reduce_s(half *src_data, half *dst_data, ReduceParameter *param, int core_mask)
-
void fp_reduce_s(float *src_data, float *dst_data, ReduceParameter *param, int core_mask)
-
void dp_reduce_s(double *src_data, double *dst_data, ReduceParameter *param, int core_mask)
C调用示例:
1void PackParam(ReduceParameter* param, int ndim, int* input_shape, int num_axes, int* axes) {
2 int tmp_input_shape[8];
3 int total_num = 1;
4 int i, j, k;
5 for (i = 0; i < ndim; i++) {
6 tmp_input_shape[i] = input_shape[i];
7 total_num *= input_shape[i];
8 }
9 param->total_num_ = total_num;
10 int offset_size = 0;
11 for (i = 0; i < num_axes; ++i) {
12 int axis = axes[i];
13 int outer_size = 1;
14 for (j = 0; j < axis; j++) {
15 outer_size *= tmp_input_shape[j];
16 }
17 param->outer_sizes_[offset_size] = outer_size;
18 int inner_size = 1;
19 for (k = axis + 1; k < ndim; k++) {
20 inner_size *= tmp_input_shape[k];
21 }
22 param->inner_sizes_[offset_size] = inner_size;
23 param->axis_sizes_[offset_size] = tmp_input_shape[axis];
24 offset_size++;
25 tmp_input_shape[axis] = 1;
26 }
27}
28
29void TestReduceSMCFp32(int* input_shape, int ndim, int* axes, int num_axes, int mode, int keep_dims, int core_mask) {
30 int core_id = get_core_id();
31 int logic_core_id = GetLogicCoreId(core_mask, core_id);
32 int core_num = GetCoreNum(core_mask);
33 float* input = (float*)0x88000000;
34 float* output = (float*)0x98000000;
35 ReduceParameter* param = (ReduceParameter*)0xA8480000;
36 if (logic_core_id == 0) {
37 param->num_axes_ = num_axes;
38 param->mode_ = mode;
39 param->data_buffers_ = (void**)0xA8483000;
40 param->inner_sizes_ = (int*)0xA8484000;
41 param->outer_sizes_ = (int*)0xA8485000;
42 param->axis_sizes_ = (int*)0xA8486000;
43 int i;
44 for (i = 0; i < num_axes - 1; i++) {
45 param->data_buffers_[i] = (void*)(0xA8490000 + 0x1000000);
46 }
47 PackParam(param, ndim, input_shape, num_axes, axes);
48 }
49 sys_bar(0, core_num); // 初始化参数完成后进行同步
50 fp_reduce_s(input, check, param, core_mask);
51}
52
/* Entry point of the shared-storage example: reduce axis 1 of a 4x5x5 tensor. */
void main() {
    int input_shape[3] = {4, 5, 5};
    int ndim = 3;
    int axes[1] = {1};
    int num_axes = 1;
    int mode = 7;  // Reduce_L2Norm
    int keep_dims = 1;
    int core_mask = 0b1111;
    TestReduceSMCFp32(input_shape, ndim, axes, num_axes, mode, keep_dims, core_mask);
}
私有存储版本:
-
void i8_reduce_p(int8_t *src_data, int8_t *dst_data, void *tmp_src_data, void *tmp_dst_data, ReduceParameter *param, int core_mask)
-
void i16_reduce_p(int16_t *src_data, half *dst_data, void *tmp_src_data, void *tmp_dst_data, ReduceParameter *param, int core_mask)
-
void i32_reduce_p(int *src_data, float *dst_data, void *tmp_src_data, void *tmp_dst_data, ReduceParameter *param, int core_mask)
-
void hp_reduce_p(half *src_data, half *dst_data, void *tmp_src_data, void *tmp_dst_data, ReduceParameter *param, int core_mask)
-
void fp_reduce_p(float *src_data, float *dst_data, void *tmp_src_data, void *tmp_dst_data, ReduceParameter *param, int core_mask)
-
void dp_reduce_p(double *src_data, double *dst_data, void *tmp_src_data, void *tmp_dst_data, ReduceParameter *param, int core_mask)
C调用示例:
1void PackParam(ReduceParameter* param, int ndim, int* input_shape, int num_axes, int* axes) {
2 int tmp_input_shape[8];
3 int total_num = 1;
4 int i, j, k;
5 for (i = 0; i < ndim; i++) {
6 tmp_input_shape[i] = input_shape[i];
7 total_num *= input_shape[i];
8 }
9 param->total_num_ = total_num;
10 int offset_size = 0;
11 for (i = 0; i < num_axes; ++i) {
12 int axis = axes[i];
13 int outer_size = 1;
14 for (j = 0; j < axis; j++) {
15 outer_size *= tmp_input_shape[j];
16 }
17 param->outer_sizes_[offset_size] = outer_size;
18 int inner_size = 1;
19 for (k = axis + 1; k < ndim; k++) {
20 inner_size *= tmp_input_shape[k];
21 }
22 param->inner_sizes_[offset_size] = inner_size;
23 param->axis_sizes_[offset_size] = tmp_input_shape[axis];
24 offset_size++;
25 tmp_input_shape[axis] = 1;
26 }
27}
28
29void TestReduceL2Fp32(int* input_shape, int ndim, int* axes, int num_axes, int mode, int keep_dims, int core_mask) {
30 float* input = (float*)0x10000000; // 原始输入输出数据需分配在AM中
31 float* output = (float*)0x10010000;
32 float* tmp_input = (float*)0x88000000; // 临时输入输出空间需分配在DDR或SMC中
33 float* tmp_output = (float*)0x98000000;
34 ReduceParameter* param = (ReduceParameter*)0x10020000;
35 param->num_axes_ = num_axes;
36 param->mode_ = mode;
37 param->data_buffers_ = (void**)0x10021000;
38 param->inner_sizes_ = (int*)0x10022000;
39 param->outer_sizes_ = (int*)0x10023000;
40 param->axis_sizes_ = (int*)0x10024000;
41 int i, j;
42 for (i = 0; i < ndim; i++) {
43 int reduce_axis = 0;
44 for (j = 0; j < num_axes; j++) {
45 if (axes[j] == i) {
46 reduce_axis = 1;
47 break;
48 }
49 }
50 if (!reduce_axis) {
51 length *= input_shape[i];
52 }
53 }
54 for (i = 0; i < num_axes - 1; i++) {
55 param->data_buffers_[i] = (void*)(0xA8490000 + 0x1000000); // 每一个中间计算结果空间都需分配在DDR或SMC中
56 }
57 param->output_num_ = length;
58 PackParam(param, ndim, input_shape, num_axes, axes);
59 fp_reduce_p(input, check, param, core_mask);
60}
61
/* Entry point of the private-storage example: reduce axis 1 of a 4x5x5 tensor. */
void main() {
    int input_shape[3] = {4, 5, 5};
    int ndim = 3;
    int axes[1] = {1};
    int num_axes = 1;
    int mode = 7;  // Reduce_L2Norm
    int keep_dims = 1;
    int core_mask = 0b0001;  // the private-storage version may only be launched on a single core
    TestReduceL2Fp32(input_shape, ndim, axes, num_axes, mode, keep_dims, core_mask);
}